In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
In [5]:
# Load the car-evaluation data set and peek at the first rows.
df = pd.read_csv("car.csv")
df.head()
Out[5]:
In [6]:
# Inspect each column's categorical levels before choosing the ordinal
# encodings below.  Single-argument print() works in both Python 2 and 3,
# unlike the original `print x` statements.
for col in ('buying', 'maint', 'doors', 'persons',
            'lug_boot', 'safety', 'acceptability'):
    print(df[col].unique())
In [7]:
# Ordinal encodings: each categorical level maps to an integer that
# preserves its natural ordering.
map1 = {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4}    # buying / maint / safety
map2 = {'small': 1, 'med': 2, 'big': 3}               # lug_boot
map3 = {'unacc': 1, 'acc': 2, 'good': 3, 'vgood': 4}  # acceptability (target)
map4 = {'2': 2, '4': 4, 'more': 5}                    # persons
map5 = {'2': 2, '3': 3, '4': 4, '5more': 5}           # doors
In [9]:
# Predictor columns: everything except the target 'acceptability'.
features = [c for c in df.columns if c != 'acceptability']

# Encode every column on a copy so the raw frame `df` stays untouched.
# Bracket indexing (df1[col] = ...) is used instead of the original
# attribute-style assignment, which is fragile and non-idiomatic.
column_maps = {'buying': map1,
               'maint': map1,
               'doors': map5,
               'persons': map4,
               'lug_boot': map2,
               'safety': map1,
               'acceptability': map3}
df1 = df.copy()
for col, mapping in column_maps.items():
    df1[col] = df[col].map(mapping)

X = df1[features]
y = df1['acceptability']
X.head(10)  # sanity check: all predictors are now numeric
#making sure it worked
Out[9]:
In [13]:
# NOTE(review): sklearn.cross_validation is the pre-0.18 API; modern sklearn
# moved these names to sklearn.model_selection (with a different KFold
# signature), so upgrading sklearn requires touching every KFold(...) call.
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

# Hold out 30% for testing; stratify so class proportions survive the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

def evaluate_model(model):
    """Fit `model` on the training split, print test-set confusion matrix and
    classification report, and return the test-set accuracy."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    return accuracy_score(y_test, y_pred)

# name -> {'model': estimator, 'score': held-out accuracy}
various_models = {}
In [14]:
# Baseline: KNN with default hyper-parameters; keep its test accuracy in `a`.
from sklearn.neighbors import KNeighborsClassifier
a = evaluate_model(KNeighborsClassifier())
In [15]:
# NOTE(review): sklearn.grid_search is the pre-0.18 module; newer releases
# provide GridSearchCV via sklearn.model_selection.
from sklearn.grid_search import GridSearchCV
# Try every neighbourhood size from 2 to 59, 3-fold shuffled CV.
params = {'n_neighbors': range(2,60)}
gsknn = GridSearchCV(KNeighborsClassifier(),
params, n_jobs=-1,
cv=KFold(len(y), n_folds=3, shuffle=True))
In [16]:
# Run the exhaustive n_neighbors search on the full data set.
gsknn.fit(X, y)
Out[16]:
In [17]:
# Best n_neighbors found by the grid search.
gsknn.best_params_
Out[17]:
In [18]:
# Mean cross-validated accuracy of the best parameter setting.
gsknn.best_score_
Out[18]:
In [19]:
# Refit the tuned KNN on the train split and report held-out metrics.
evaluate_model(gsknn.best_estimator_)
Out[19]:
In [20]:
# Store the tuned KNN together with *its own* held-out accuracy.
# Bug fix: the original stored `a`, which is the accuracy of the untuned
# baseline KNN from an earlier cell, not of gsknn.best_estimator_.
various_models['knn'] = {'model': gsknn.best_estimator_,
                         'score': evaluate_model(gsknn.best_estimator_)}
In [21]:
# Bagging ensemble built from default KNN base classifiers.
from sklearn.ensemble import BaggingClassifier
baggingknn = BaggingClassifier(KNeighborsClassifier())
In [22]:
# Held-out performance of the untuned bagged KNN.
evaluate_model(baggingknn)
Out[22]:
In [23]:
# Shared bagging grid (reused for every bagged model below): ensemble size,
# row/feature subsampling fractions, and bootstrap of features.
bagging_params = {'n_estimators': [10, 20],
'max_samples': [0.7, 1.0],
'max_features': [0.7, 1.0],
'bootstrap_features': [True, False]}
gsbaggingknn = GridSearchCV(baggingknn,
bagging_params, n_jobs=-1,
cv=KFold(len(y), n_folds=3, shuffle=True))
In [24]:
# Grid-search the bagging hyper-parameters (3-fold CV on the full data).
gsbaggingknn.fit(X, y)
Out[24]:
In [25]:
# Winning bagging configuration for the KNN ensemble.
gsbaggingknn.best_params_
Out[25]:
In [26]:
# Keep the tuned bagged-KNN model and its held-out accuracy.
various_models['gsbaggingknn'] = {'model': gsbaggingknn.best_estimator_,
'score': evaluate_model(gsbaggingknn.best_estimator_)}
In [27]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with default hyper-parameters.
lr = LogisticRegression()
lr_score = evaluate_model(lr)
various_models['lr'] = {'model': lr, 'score': lr_score}
In [28]:
# Grid-search regularisation strength and penalty type for logistic
# regression, then store the tuned model with its held-out accuracy.
# (Py2-only `print x` statements replaced with print(), which behaves
# identically for a single argument under Python 2 and 3.)
params = {'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
          'penalty': ['l1', 'l2']}
gslr = GridSearchCV(lr, params, n_jobs=-1,
                    cv=KFold(len(y), n_folds=3, shuffle=True))
gslr.fit(X, y)
print(gslr.best_params_)
print(gslr.best_score_)
various_models['gslr'] = {'model': gslr.best_estimator_,
                          'score': evaluate_model(gslr.best_estimator_)}
In [48]:
# Bag the tuned logistic regression and grid-search the bagging knobs.
gsbagginglr = GridSearchCV(BaggingClassifier(gslr.best_estimator_),
                           bagging_params, n_jobs=-1,
                           cv=KFold(len(y), n_folds=3, shuffle=True))
gsbagginglr.fit(X, y)
print(gsbagginglr.best_params_)
print(gsbagginglr.best_score_)
various_models['gsbagginglr'] = {'model': gsbagginglr.best_estimator_,
                                 'score': evaluate_model(gsbagginglr.best_estimator_)}
In [ ]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default settings.
dt = DecisionTreeClassifier()
dt_score = evaluate_model(dt)
various_models['dt'] = {'model': dt, 'score': dt_score}
In [ ]:
# Grid-search the main decision-tree regularisation knobs.
params = {'criterion': ['gini', 'entropy'],
          'splitter': ['best', 'random'],
          'max_depth': [None, 5, 10],
          'min_samples_split': [2, 5],
          'min_samples_leaf': [1, 2, 3]}
gsdt = GridSearchCV(dt, params, n_jobs=-1,
                    cv=KFold(len(y), n_folds=3, shuffle=True))
gsdt.fit(X, y)
print(gsdt.best_params_)
print(gsdt.best_score_)
various_models['gsdt'] = {'model': gsdt.best_estimator_,
                          'score': evaluate_model(gsdt.best_estimator_)}
In [ ]:
# Bag the tuned decision tree and grid-search the bagging knobs.
gsbaggingdt = GridSearchCV(BaggingClassifier(gsdt.best_estimator_),
                           bagging_params, n_jobs=-1,
                           cv=KFold(len(y), n_folds=3, shuffle=True))
gsbaggingdt.fit(X, y)
print(gsbaggingdt.best_params_)
print(gsbaggingdt.best_score_)
various_models['gsbaggingdt'] = {'model': gsbaggingdt.best_estimator_,
                                 'score': evaluate_model(gsbaggingdt.best_estimator_)}
In [30]:
from sklearn.svm import SVC

# Baseline support-vector classifier with default hyper-parameters.
svm = SVC()
svm_score = evaluate_model(svm)
various_models['svm'] = {'model': svm, 'score': svm_score}
In [31]:
# Grid-search SVM regularisation, kernel width, and kernel type.
params = {'C': [0.01, 0.1, 1.0, 10.0, 30.0, 100.0],
          'gamma': ['auto', 0.1, 1.0, 10.0],
          'kernel': ['linear', 'rbf']}
gssvm = GridSearchCV(svm, params, n_jobs=-1,
                     cv=KFold(len(y), n_folds=3, shuffle=True))
gssvm.fit(X, y)
print(gssvm.best_params_)
print(gssvm.best_score_)
various_models['gssvm'] = {'model': gssvm.best_estimator_,
                           'score': evaluate_model(gssvm.best_estimator_)}
In [32]:
# Bag the tuned SVM and grid-search the bagging knobs.
gsbaggingsvm = GridSearchCV(BaggingClassifier(gssvm.best_estimator_),
                            bagging_params, n_jobs=-1,
                            cv=KFold(len(y), n_folds=3, shuffle=True))
gsbaggingsvm.fit(X, y)
print(gsbaggingsvm.best_params_)
print(gsbaggingsvm.best_score_)
various_models['gsbaggingsvm'] = {'model': gsbaggingsvm.best_estimator_,
                                  'score': evaluate_model(gsbaggingsvm.best_estimator_)}
In [44]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Untuned tree ensembles as further baselines.
rf = RandomForestClassifier()
various_models['rf'] = {'model': rf, 'score': evaluate_model(rf)}

et = ExtraTreesClassifier()
various_models['et'] = {'model': et, 'score': evaluate_model(et)}
In [45]:
# Grid-search random-forest size, split criterion, depth, and class weights.
# This grid is reused for the extra-trees search in the next cell.
params = {'n_estimators': [3, 5, 10, 50],
          'criterion': ['gini', 'entropy'],
          'max_depth': [None, 3, 5],
          'min_samples_split': [2, 5],
          'class_weight': [None, 'balanced']}
gsrf = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                    params, n_jobs=-1,
                    cv=KFold(len(y), n_folds=3, shuffle=True))
gsrf.fit(X, y)
print(gsrf.best_params_)
print(gsrf.best_score_)
various_models['gsrf'] = {'model': gsrf.best_estimator_,
                          'score': evaluate_model(gsrf.best_estimator_)}
In [47]:
# Grid-search extra-trees with the same grid used for the random forest.
gset = GridSearchCV(ExtraTreesClassifier(n_jobs=-1),
                    params, n_jobs=-1,
                    cv=KFold(len(y), n_folds=3, shuffle=True))
gset.fit(X, y)
print(gset.best_params_)
print(gset.best_score_)
various_models['gset'] = {'model': gset.best_estimator_,
                          'score': evaluate_model(gset.best_estimator_)}
In [50]:
# Collect every model's held-out accuracy into one sorted table and chart it.
# `.items()` replaces Py2-only `.iteritems()`; iteration order is irrelevant
# because the frame is sorted immediately afterwards.
scores = pd.DataFrame([(k, v['score']) for k, v in various_models.items()],
                      columns=['model', 'score']).set_index('model').sort_values('score', ascending=False)
plt.style.use('fivethirtyeight')
scores.plot(kind='bar')
plt.ylim(0.5, 1.05)
scores
Out[50]:
In [51]:
# Re-score every stored model with stratified 3-fold cross-validation on the
# full data set, to double-check the single train/test-split accuracies.
from sklearn.cross_validation import cross_val_score, StratifiedKFold

def retest(model):
    """Return (mean, std) of stratified 3-fold CV accuracy for `model`."""
    scores = cross_val_score(model, X, y,
                             cv=StratifiedKFold(y, shuffle=True),
                             n_jobs=-1)
    return scores.mean(), scores.std()

for k, v in various_models.items():
    print(k)  # progress indicator (original used Py2-only `print k,`)
    various_models[k]['cvres'] = retest(v['model'])
In [52]:
# Plot cross-validated mean accuracy with std-dev error bars per model.
# `.items()` replaces Py2-only `.iteritems()`; the unused `rects1` local
# was dropped.
cvscores = pd.DataFrame([(k, v['cvres'][0], v['cvres'][1]) for k, v in various_models.items()],
                        columns=['model', 'score', 'error']).set_index('model').sort_values('score', ascending=False)
fig, ax = plt.subplots()
ax.bar(range(len(cvscores)), cvscores.score,
       yerr=cvscores.error,
       tick_label=cvscores.index)
plt.style.use('fivethirtyeight')
ax.set_ylabel('Scores')
plt.xticks(rotation=70)
plt.ylim(0.5, 1.05)
cvscores
Out[52]: